# Setup ----
# Clear the global environment so objects left over from an earlier
# interactive session cannot leak into this run.
rm(list = ls())

# Attach dependencies with library() rather than require(): library() stops
# immediately when a package is missing, whereas require() only warns and
# returns FALSE, letting the script fail later with a less obvious error.
library(quanteda)
library(LSX)
library(stringi)
library(tidyverse)
library(readtext)

# Load the definitions kept in functions.R.
source("functions.R")

# Number of threads requested for quanteda's parallelised functions.
quanteda_options(threads = 60)

# Data import ----
# For each publication: read every .txt file in its folder as UTF-8, take the
# `date` document variable from the file name, convert the result to a plain
# data frame, and cache it as an RDS file.
dat_nabaa <- data.frame(
  readtext(
    "data/Nabaa/*.txt",
    docvarsfrom = "filenames",
    docvarnames = "date",
    encoding = "UTF-8"
  )
)
saveRDS(dat_nabaa, "data/dat_nabaa.RDS")

dat_sumud <- data.frame(
  readtext(
    "data/Sumud/*.txt",
    docvarsfrom = "filenames",
    docvarnames = "date",
    encoding = "UTF-8"
  )
)
saveRDS(dat_sumud, "data/dat_sumud.RDS")

#dat_umma <- readtext(("data/One_umma/*.txt"), docvarsfrom = "filenames", 
#                      docvarnames = "date", encoding="UTF-8")
#dat_umma <- data.frame(dat_umma)
#saveRDS(dat_umma, file = "data/dat_umma.RDS")


# Data conversion ----
# Reload each cached data frame, tag every row with its publication name, and
# assign a publication-prefixed running identifier (e.g. "Nabaa_1").
dat_nabaa <- readRDS("data/dat_nabaa.RDS")
dat_nabaa[["publication"]] <- "Nabaa"
dat_nabaa[["docid"]] <- paste0("Nabaa_", seq_len(nrow(dat_nabaa)))

dat_sumud <- readRDS("data/dat_sumud.RDS")
dat_sumud[["publication"]] <- "Sumud"
dat_sumud[["docid"]] <- paste0("Sumud_", seq_len(nrow(dat_sumud)))

# Third publication, currently disabled:
#dat_umma <- readRDS("data/dat_umma.RDS")
#dat_umma$publication <- "Umma"
#dat_umma$docid <- paste0("Umma_", seq_len(nrow(dat_umma)))

# Stack the publications into one data frame.
#dat <- rbind(dat_nabaa, dat_sumud, dat_umma)
dat <- rbind(dat_nabaa, dat_sumud)

# Date ----
# Pull the first YYYY-MM-DD pattern out of the file-name docvar.
# stri_extract_first_regex() returns a plain character vector (NA where
# nothing matches), whereas stri_match_first_regex() returns a matrix.
# The explicit format makes every value that is not a valid calendar date
# become NA; without it as.Date() guesses the format from the first non-NA
# element and aborts if that element cannot be parsed.
dat$date <- as.Date(
  stri_extract_first_regex(dat$date, "\\d{4}-\\d{2}-\\d{2}"),
  format = "%Y-%m-%d"
)

# Rows without a date are removed by the year filter below; report them so
# the loss is not silent.
if (anyNA(dat$date)) {
  warning(
    sum(is.na(dat$date)),
    " document(s) have no parseable date and will be dropped",
    call. = FALSE
  )
}

dat$year <- lubridate::year(dat$date)
dat$month <- lubridate::month(dat$date)

# Keep documents dated 2014 through 2023; subset() treats an NA condition as
# FALSE, so undated rows are excluded here.
dat <- subset(dat, 2014 <= year & year <= 2023)

# Replace doc_id with the publication-prefixed identifier (doc_id is the
# column corpus() takes document names from by default), then keep docid as a
# factor whose levels follow the order of first appearance.
dat$doc_id <- dat$docid
dat$docid <- factor(dat$docid, levels = unique(dat$docid))

# Corpus ----
# Document-level corpus: the `text` column supplies the texts.
corp <- corpus(dat, text_field = "text")

# Sentence-level corpus. corpus_reshape() has no `text_field` parameter (the
# texts already live inside `corp`), so only the target unit is passed; the
# stray argument previously fell through `...`.
corp2 <- corpus_reshape(corp, to = "sentences")

# Persist both corpora.
saveRDS(corp, "data/data_corpus.RDS")
saveRDS(corp2, "data/data_corpus_sent.RDS")
